# IPR (Internet Policy Review) Downloader
# download_ipr_yearly.py
# -------------------------------------------------
# Automates downloading PDFs from Internet Policy Review yearly archive pages
# - Parses all article entries from a given yearly archive URL
# - Skips non-article categories such as "News" and "Opinion"
# - Visits each article page to locate the correct PDF link in the <aside> sidebar
# - Ensures only sidebar "piwik_download" PDFs are downloaded (avoids references)
# - Handles relative URLs via urljoin and skips invalid or non-PDF responses
# - Creates dynamic folder names like IPR_2020 for organized storage
# - Sanitizes filenames to be Windows-safe and logs all downloads to CSV
# - Works for all years available on the IPR archive site
# IPR (Internet Policy Review) Year Archive Downloader — STRICT HOSTED PDF
# download_ipr_year_live.py
# -------------------------------------------------
# Downloads ONLY the official IPR PDF per article, avoiding PDFs in references.
# - Input: https://policyreview.info/archives/YYYY
# - Archive parse: <ul class="list__item__ul list__item__ul--volumes"> ... <li> ... <h3><a href="/...">Title</a>
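# - Article parse: take the sidebar PDF button first:
#     aside#right .block-ipr-analysis-block .content a.piwik_download[type="application/pdf"]
#   then fall back to <link rel="alternate" type="application/pdf"> and to any aside#right a[href^="/pdf/"]
#   (every candidate must be on the same host and match /pdf/policyreview-<year>-<n>-<n>.pdf)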
# - Output: ./IPR_{year}/<Title>.pdf + CSV log
#
# Defaults:
#   • Auto-skip News/Opinion items (usually have no hosted PDF)
# Options:
#   --dry-run             : list items + chosen hosted PDF (no download)
#   --max N               : download at most N PDFs (0, the default, means no limit)
#   --delay S             : sleep S seconds between downloads
#   --allow-external-pdfs : if no hosted PDF, attempt first external PDF
#   --include-opinion     : include opinion/news items (overrides auto-skip)
#   --debug-pdf           : print all candidate PDF links found per article



import re
import csv
import sys
import time
import argparse
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Headers sent with every HTTP request. get_soup() and download_file() copy
# this dict and add a per-request "Referer" on top of it.
HEADERS_BASE = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    )
}
# Passed as `timeout=` to requests.get (seconds).
TIMEOUT = 60
# Pause handed to time.sleep() after a failed request attempt (seconds).
RETRY_SLEEP = 2
# Total number of attempts per request (used as range(MAX_RETRIES)),
# i.e. the first try is included in this count.
MAX_RETRIES = 3

# Directory containing this script; the IPR_<year> output folder is created under it.
SCRIPT_DIR = Path(__file__).resolve().parent


def sanitize_filename(name: str) -> str:
    """
    Return *name* reduced to a Windows-safe file stem (no extension added).

    Steps, in order:
      - drop the characters Windows forbids in file names: \\ / * ? : " < > |
      - collapse every whitespace run to a single space
      - collapse runs of dots and trim leading/trailing dots and spaces
      - cap the length at 180 characters, then trim again so the cut cannot
        leave a trailing space or dot (Windows rejects or strips those)

    Falls back to "untitled" when nothing usable is left, so callers never
    end up with a bare ".pdf" file name.
    """
    name = re.sub(r'[\\/*?:"<>|]', "", name)
    name = re.sub(r"\s+", " ", name).strip()
    name = re.sub(r"\.+", ".", name).strip(". ")
    # Truncation can expose a space/dot at the new end; strip once more.
    name = name[:180].rstrip(". ")
    return name or "untitled"


def get_soup(url: str, referer: str | None = None) -> BeautifulSoup:
    """
    Fetch *url* and return its body parsed with BeautifulSoup ("html.parser").

    The request carries HEADERS_BASE plus a "Referer" header when *referer*
    is given. Network/HTTP failures (requests.RequestException, which covers
    raise_for_status() errors) are retried up to MAX_RETRIES attempts in
    total, pausing RETRY_SLEEP seconds between attempts.

    Raises the last requests.RequestException once all attempts have failed.
    Any other exception is not retried and propagates immediately.
    """
    headers = dict(HEADERS_BASE)
    if referer:
        headers["Referer"] = referer

    last_exc: Exception | None = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            r = requests.get(url, headers=headers, timeout=TIMEOUT)
            r.raise_for_status()
            return BeautifulSoup(r.text, "html.parser")
        except requests.RequestException as e:
            last_exc = e
            # No point sleeping after the final attempt; we are about to raise.
            if attempt < MAX_RETRIES:
                time.sleep(RETRY_SLEEP)

    if last_exc is None:
        # Only reachable when MAX_RETRIES < 1, i.e. no request was ever made.
        raise RuntimeError("MAX_RETRIES must be at least 1")
    raise last_exc


def detect_base(url: str, soup: BeautifulSoup) -> str:
    """
    Return the absolute base URL to resolve the page's relative links against.

    If the document has a <base> tag with a non-blank href, that href is
    resolved against *url* (so a relative value such as "/" still yields an
    absolute URL). Otherwise the result is the scheme and host of *url*
    followed by "/".
    """
    base = soup.find("base")
    href = (base.get("href") or "").strip() if base is not None else ""
    if href:
        # urljoin leaves an already-absolute href untouched.
        return urljoin(url, href)
    p = urlparse(url)
    return f"{p.scheme}://{p.netloc}/"


def parse_year(url: str, soup: BeautifulSoup) -> str:
    """
    Work out the four-digit archive year as a string.

    Sources, tried in order:
      1. the "/archives/YYYY" segment of *url*
      2. the first 19xx/20xx year in the page <title>
      3. the first 19xx/20xx year anywhere in the page text
    Returns the literal "Year" when none of them yields a match.

    A year only counts when it is not part of a longer digit run, so an id
    such as "120201" is not mistaken for 2020.
    """
    m = re.search(r"/archives/(\d{4})(?!\d)", url)
    if m:
        return m.group(1)

    # Lookarounds reject digits on either side of the four-digit year.
    year_re = re.compile(r"(?<!\d)(?:19|20)\d{2}(?!\d)")
    title = soup.title.get_text(" ", strip=True) if soup.title else ""
    # `or` keeps the full-page text extraction lazy: it only runs when the
    # title has no year.
    m = year_re.search(title) or year_re.search(soup.get_text(" ", strip=True))
    return m.group(0) if m else "Year"


def collect_items_from_archive(soup: BeautifulSoup, base_url: str, include_opinion: bool):
    """
    Return list of (title, article_url, li_classes), one per unique article URL,
    in page order.

    Entries are read from every <li> under "ul.list__item__ul--volumes" that
    has an "h3 a[href]" link with a non-empty title and a site-relative href
    (starts with a single "/"). article_url is that href resolved against
    *base_url*; li_classes is the set of CSS classes on the <li>.

    Auto-skip Opinion/News items unless include_opinion=True. An item counts
    as Opinion/News when:
      - the <li> carries the class 'opinion' or 'news--opinion', or
      - the path of the resolved URL starts with /articles/news/
    """
    results = []
    seen = set()
    for ul in soup.select("ul.list__item__ul--volumes"):
        for li in ul.select("li"):
            a = li.select_one("h3 a[href]")
            if a is None:
                continue
            href = (a.get("href") or "").strip()
            title = a.get_text(" ", strip=True)
            # "//host/..." is protocol-relative and would resolve to another
            # host, so it does not qualify as a site-relative link.
            if not title or not href.startswith("/") or href.startswith("//"):
                continue
            abs_url = urljoin(base_url, href)
            classes = set(li.get("class", []))

            # Test the URL *path* rather than a prefix built from base_url:
            # the prefix comparison breaks when base_url has a path component
            # or upper-case characters.
            is_opinion_news = (
                not classes.isdisjoint({"opinion", "news--opinion"})
                or urlparse(abs_url).path.lower().startswith("/articles/news/")
            )
            if is_opinion_news and not include_opinion:
                continue

            # de-dup, keeping the first occurrence
            if abs_url in seen:
                continue
            seen.add(abs_url)
            results.append((title, abs_url, classes))
    return results


def choose_hosted_pdf(article_url: str, year_hint: str, debug_pdf: bool = False):
    """
    Fetch the article page and return (hosted_pdf, externals).

    hosted_pdf is the absolute URL of the official IPR PDF, or None. A link
    qualifies when it resolves to the same host as *article_url* and its href
    matches /pdf/policyreview-<year_hint>-<digits>-<digits>.pdf. Places are
    searched in this order and the first qualifying link wins:
      1. the sidebar button
         aside#right .block-ipr-analysis-block .content a.piwik_download[type="application/pdf"]
      2. <link rel="alternate" type="application/pdf">
      3. any aside#right a[href^='/pdf/']

    externals lists (as absolute URLs) every a[href$='.pdf'] on the page that
    does not qualify as the official PDF; the function itself only reports
    them. With debug_pdf=True the outcome is printed.
    """
    page = get_soup(article_url, referer=article_url)
    base_href = detect_base(article_url, page)
    own_host = urlparse(article_url).netloc
    official = re.compile(rf"/pdf/policyreview-{re.escape(year_hint)}-\d+-\d+\.pdf$")

    def resolve(tag):
        """Return (raw href, absolute URL) for a link element."""
        raw = (tag.get("href") or "").strip()
        return raw, urljoin(base_href, raw)

    def is_official(raw: str, absolute: str) -> bool:
        """True when the link is a same-host PDF matching the official pattern."""
        return urlparse(absolute).netloc == own_host and bool(official.search(raw))

    # Ordered from most to least specific; later selectors are only queried
    # when the earlier ones produced no qualifying link.
    selectors = (
        'aside#right .block-ipr-analysis-block .content a.piwik_download[type="application/pdf"][href]',
        "link[rel='alternate'][type='application/pdf'][href]",
        "aside#right a[href^='/pdf/']",
    )
    hosted_pdf = None
    for selector in selectors:
        for tag in page.select(selector):
            raw, absolute = resolve(tag)
            if is_official(raw, absolute):
                hosted_pdf = absolute
                break
        if hosted_pdf:
            break

    # Everything else ending in .pdf is kept for diagnostics.
    externals = []
    for tag in page.select("a[href$='.pdf']"):
        raw, absolute = resolve(tag)
        if not is_official(raw, absolute):
            externals.append(absolute)

    if debug_pdf:
        print("    [DEBUG] Hosted:", hosted_pdf)
        for i, e in enumerate(externals, 1):
            print(f"    [DEBUG] External[{i}]: {e}")

    return hosted_pdf, externals


def ensure_pdf_response(resp: requests.Response) -> bool:
    """
    Tell whether *resp* looks like a PDF.

    True when the Content-Type header contains "pdf" (case-insensitive);
    otherwise true only when the body starts with the "%PDF-" signature.
    The body is inspected only if the header check fails.
    """
    declared_type = resp.headers.get("Content-Type") or ""
    return "pdf" in declared_type.lower() or resp.content[:5] == b"%PDF-"


def download_file(url: str, dest: Path, referer: str, delay: float = 0.0):
    """
    Download *url* to *dest*, accepting only PDF responses.

    The request carries HEADERS_BASE plus "Referer: <referer>". Up to
    MAX_RETRIES attempts are made, pausing RETRY_SLEEP seconds between them.
    A response that ensure_pdf_response() rejects counts as a failed attempt.

    The body is written to "<dest name>.part" first and renamed onto *dest*
    only after the write completed, so an interrupted run cannot leave a
    truncated file under the final name. After a successful download the
    function sleeps *delay* seconds when delay > 0.

    Raises the last error (requests.RequestException, ValueError for a
    non-PDF response, or OSError from the file system) once every attempt
    has failed.
    """
    headers = dict(HEADERS_BASE)
    headers["Referer"] = referer
    tmp_path = dest.with_name(dest.name + ".part")

    last_exc: Exception | None = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            r = requests.get(url, headers=headers, timeout=TIMEOUT)
            r.raise_for_status()
            if not ensure_pdf_response(r):
                raise ValueError(
                    f"Non-PDF response (Content-Type={r.headers.get('Content-Type')})"
                )
            dest.parent.mkdir(parents=True, exist_ok=True)
            tmp_path.write_bytes(r.content)
            # Rename only once the full body is on disk.
            tmp_path.replace(dest)
        except (requests.RequestException, ValueError, OSError) as e:
            last_exc = e
            try:
                tmp_path.unlink(missing_ok=True)
            except OSError:
                # Best-effort cleanup; the original error is the one to report.
                pass
            # No point sleeping after the final attempt; we are about to raise.
            if attempt < MAX_RETRIES:
                time.sleep(RETRY_SLEEP)
        else:
            if delay > 0:
                time.sleep(delay)
            return

    if last_exc is None:
        # Only reachable when MAX_RETRIES < 1, i.e. no request was ever made.
        raise RuntimeError("MAX_RETRIES must be at least 1")
    raise last_exc


def main():
    """
    Command-line entry point: download the official PDFs of one IPR year archive.

    Flow:
      1. Parse the CLI options; prompt for the archive URL when it was not given.
      2. Fetch the archive page and collect its article entries.
      3. With --dry-run: print each entry with its chosen hosted PDF and any
         other PDF links, then return without creating any file or folder.
      4. Otherwise: create SCRIPT_DIR/IPR_<year>, download one PDF per entry
         (skipping non-empty files that already exist) and write one row per
         entry to IPR_<year>_log.csv.

    A failure on a single article (page fetch or download) is reported and
    logged, and the run continues with the next entry. Exits with status 1
    when no URL is provided or the archive page cannot be fetched.
    """
    parser = argparse.ArgumentParser(
        description="Download IPR PDFs from a year archive page (strict hosted-PDF matching with News/Opinion auto-skip)"
    )
    parser.add_argument("archive_url", nargs="?", help="Year archive URL, e.g., https://policyreview.info/archives/2020")
    parser.add_argument("--dry-run", action="store_true", help="List items and chosen hosted PDF without downloading")
    parser.add_argument("--max", type=int, default=0, help="Download at most N PDFs")
    parser.add_argument("--delay", type=float, default=0.0, help="Seconds to sleep between downloads")
    parser.add_argument("--allow-external-pdfs", action="store_true", help="If no hosted PDF, attempt first external PDF")
    parser.add_argument("--include-opinion", action="store_true", help="Include opinion/news items (overrides auto-skip)")
    parser.add_argument("--debug-pdf", action="store_true", help="Print debug info about found PDFs per article")
    args = parser.parse_args()

    archive_url = args.archive_url or input(
        "Paste IPR year archive URL (e.g., https://policyreview.info/archives/2020): "
    ).strip()
    if not archive_url:
        print("ERROR: No URL provided.")
        sys.exit(1)

    print(f"[INFO] Fetching archive page: {archive_url}")
    try:
        archive_soup = get_soup(archive_url)
    except Exception as e:
        # Top-level boundary: report a readable message instead of a traceback.
        print(f"ERROR: Could not fetch archive page: {e}")
        sys.exit(1)
    base_url = detect_base(archive_url, archive_soup)
    year = parse_year(archive_url, archive_soup)

    items = collect_items_from_archive(archive_soup, base_url, include_opinion=args.include_opinion)
    print(f"[INFO] Found {len(items)} archive entries (after auto-skip of News/Opinion unless overridden)")

    if args.dry_run:
        for idx, (title, article_url, _classes) in enumerate(items, 1):
            try:
                hosted_pdf, externals = choose_hosted_pdf(article_url, year, debug_pdf=args.debug_pdf)
            except Exception as e:
                # One unreachable article page must not abort the listing.
                print(f"[{idx}] {title}\n    Article : {article_url}\n    ❌ Error: {e}")
                continue
            print(f"[{idx}] {title}\n    Article : {article_url}\n    Hosted  : {hosted_pdf or '[NONE]'}")
            for eidx, ext in enumerate(externals, 1):
                print(f"    External[{eidx}]: {ext}")
        print("[DRY-RUN] No downloads performed.")
        return

    # Created only now so that --dry-run leaves the file system untouched.
    out_folder = SCRIPT_DIR / f"IPR_{year}"
    out_folder.mkdir(parents=True, exist_ok=True)
    log_path = out_folder / f"IPR_{year}_log.csv"

    saved = 0
    with open(log_path, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Title", "Article URL", "Hosted PDF", "External PDFs (if any)", "Filename", "Status"])

        for idx, (title, article_url, _classes) in enumerate(items, 1):
            # --max 0 (the default) means "no limit".
            if args.max and saved >= args.max:
                break

            try:
                hosted_pdf, externals = choose_hosted_pdf(article_url, year, debug_pdf=args.debug_pdf)
            except Exception as e:
                # One unreachable article page must not abort the whole run.
                writer.writerow([title, article_url, "", "", "", f"Error: {e}"])
                print(f"[{idx}] ❌ Error: {e}")
                continue
            pdf_url = hosted_pdf

            status_note = ""
            if not pdf_url and args.allow_external_pdfs:
                pdf_url = externals[0] if externals else ""
                status_note = " (external)"
            if not pdf_url:
                writer.writerow([title, article_url, hosted_pdf or "", ";".join(externals), "", "Skipped (no hosted PDF)"])
                print(f"[{idx}] ⚠️ No hosted PDF: {title}")
                continue

            safe_title = sanitize_filename(title)
            outfile = out_folder / f"{safe_title}.pdf"

            if outfile.exists() and outfile.stat().st_size > 0:
                writer.writerow([title, article_url, hosted_pdf or "", ";".join(externals), outfile.name, "Exists"])
                print(f"[{idx}] ✅ Exists: {outfile.name}")
                continue

            print(f"[{idx}] Downloading: {safe_title}{status_note}")
            try:
                download_file(pdf_url, outfile, referer=article_url, delay=args.delay)
            except Exception as e:
                writer.writerow([title, article_url, hosted_pdf or "", ";".join(externals), outfile.name, f"Error: {e}"])
                print(f"    ❌ Error: {e}")
            else:
                writer.writerow([title, article_url, hosted_pdf or "", ";".join(externals), outfile.name, "OK"])
                print(f"    ✅ Saved: {outfile.name}")
                saved += 1

    print(f"\nDone! {saved} PDFs saved in {out_folder}")
    print(f"Log: {log_path}")


# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
